1 package org.apache.solr.spelling;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Collections;
23 import java.util.Iterator;
24 import java.util.List;
25 import java.util.Locale;
26 import java.util.regex.Pattern;
27
28 import org.apache.lucene.analysis.Token;
29 import org.apache.lucene.index.IndexReader;
30 import org.apache.lucene.index.Term;
31 import org.apache.lucene.search.spell.CombineSuggestion;
32 import org.apache.lucene.search.spell.SuggestWord;
33 import org.apache.lucene.search.spell.WordBreakSpellChecker;
34 import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
35 import org.apache.solr.common.util.NamedList;
36 import org.apache.solr.core.SolrCore;
37 import org.apache.solr.search.SolrIndexSearcher;
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57 public class WordBreakSolrSpellChecker extends SolrSpellChecker {
58
59
60
61
62
63 public static final String PARAM_COMBINE_WORDS = "combineWords";
64
65
66
67
68
69 public static final String PARAM_BREAK_WORDS = "breakWords";
70
71
72
73 public static final String PARAM_MAX_CHANGES = "maxChanges";
74
75
76
77 public static final String PARAM_MAX_COMBINE_WORD_LENGTH = "maxCombinedLength";
78
79
80
81 public static final String PARAM_MIN_BREAK_WORD_LENGTH = "minBreakLength";
82
83
84
85 public static final String PARAM_BREAK_SUGGESTION_TIE_BREAKER = "breakSugestionTieBreaker";
86
87
88
89 public static final String PARAM_MAX_EVALUATIONS = "maxEvaluations";
90
91
92
93 public static final String PARAM_MIN_SUGGESTION_FREQUENCY = "minSuggestionFreq";
94
95
96
97
98
99
100
101 public enum BreakSuggestionTieBreaker {
102
103
104
105
106
107 MAX_FREQ,
108
109
110
111
112 SUM_FREQ
113 };
114
115 private WordBreakSpellChecker wbsp = null;
116 private boolean combineWords = false;
117 private boolean breakWords = false;
118 private BreakSuggestionSortMethod sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
119 private static final Pattern spacePattern = Pattern.compile("\\s+");
120
121 @Override
122 public String init(@SuppressWarnings("unchecked") NamedList config,
123 SolrCore core) {
124 String name = super.init(config, core);
125 combineWords = boolParam(config, PARAM_COMBINE_WORDS);
126 breakWords = boolParam(config, PARAM_BREAK_WORDS);
127 wbsp = new WordBreakSpellChecker();
128 String bstb = strParam(config, PARAM_BREAK_SUGGESTION_TIE_BREAKER);
129 if (bstb != null) {
130 bstb = bstb.toUpperCase(Locale.ROOT);
131 if (bstb.equals(BreakSuggestionTieBreaker.SUM_FREQ.name())) {
132 sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_SUMMED_FREQUENCY;
133 } else if (bstb.equals(BreakSuggestionTieBreaker.MAX_FREQ.name())) {
134 sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
135 } else {
136 throw new IllegalArgumentException("Invalid value for parameter "
137 + PARAM_BREAK_SUGGESTION_TIE_BREAKER + " : " + bstb);
138 }
139 } else {
140 sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
141 }
142 int mc = intParam(config, PARAM_MAX_CHANGES);
143 if (mc > 0) {
144 wbsp.setMaxChanges(mc);
145 }
146 int mcl = intParam(config, PARAM_MAX_COMBINE_WORD_LENGTH);
147 if (mcl > 0) {
148 wbsp.setMaxCombineWordLength(mcl);
149 }
150 int mbwl = intParam(config, PARAM_MIN_BREAK_WORD_LENGTH);
151 if (mbwl > 0) {
152 wbsp.setMinBreakWordLength(mbwl);
153 }
154 int me = intParam(config, PARAM_MAX_EVALUATIONS);
155 if (me > 0) {
156 wbsp.setMaxEvaluations(me);
157 }
158 int msf = intParam(config, PARAM_MIN_SUGGESTION_FREQUENCY);
159 if (msf > 0) {
160 wbsp.setMinSuggestionFrequency(msf);
161 }
162 return name;
163 }
164
165 private String strParam(@SuppressWarnings("unchecked") NamedList config,
166 String paramName) {
167 Object o = config.get(paramName);
168 return o == null ? null : o.toString();
169 }
170
171 private boolean boolParam(@SuppressWarnings("unchecked") NamedList config,
172 String paramName) {
173 String s = strParam(config, paramName);
174 if ("true".equalsIgnoreCase(s) || "on".equalsIgnoreCase(s)) {
175 return true;
176 }
177 return false;
178 }
179
180 private int intParam(@SuppressWarnings("unchecked") NamedList config,
181 String paramName) {
182 Object o = config.get(paramName);
183 if (o == null) {
184 return 0;
185 }
186 try {
187 return Integer.parseInt(o.toString());
188 } catch (NumberFormatException nfe) {
189 throw new IllegalArgumentException("Invalid integer for parameter "
190 + paramName + " : " + o);
191 }
192 }
193
194 @Override
195 public SpellingResult getSuggestions(SpellingOptions options)
196 throws IOException {
197 IndexReader ir = options.reader;
198 int numSuggestions = options.count;
199
200 StringBuilder sb = new StringBuilder();
201 Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]);
202 List<Term> termArr = new ArrayList<>(options.tokens.size() + 2);
203
204 List<ResultEntry> breakSuggestionList = new ArrayList<>();
205 List<ResultEntry> noBreakSuggestionList = new ArrayList<>();
206 boolean lastOneProhibited = false;
207 boolean lastOneRequired = false;
208 boolean lastOneprocedesNewBooleanOp = false;
209 for (int i = 0; i < tokenArr.length; i++) {
210 boolean prohibited =
211 (tokenArr[i].getFlags() & QueryConverter.PROHIBITED_TERM_FLAG) ==
212 QueryConverter.PROHIBITED_TERM_FLAG;
213 boolean required =
214 (tokenArr[i].getFlags() & QueryConverter.REQUIRED_TERM_FLAG) ==
215 QueryConverter.REQUIRED_TERM_FLAG;
216 boolean procedesNewBooleanOp =
217 (tokenArr[i].getFlags() & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG) ==
218 QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
219 if (i > 0
220 && (prohibited != lastOneProhibited || required != lastOneRequired || lastOneprocedesNewBooleanOp)) {
221 termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
222 }
223 lastOneProhibited = prohibited;
224 lastOneRequired = required;
225 lastOneprocedesNewBooleanOp = procedesNewBooleanOp;
226
227 Term thisTerm = new Term(field, tokenArr[i].toString());
228 termArr.add(thisTerm);
229 if (breakWords) {
230 SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm,
231 numSuggestions, ir, options.suggestMode, sortMethod);
232 if(breakSuggestions.length==0) {
233 noBreakSuggestionList.add(new ResultEntry(tokenArr[i], null, 0));
234 }
235 for (SuggestWord[] breakSuggestion : breakSuggestions) {
236 sb.delete(0, sb.length());
237 boolean firstOne = true;
238 int freq = 0;
239 for (SuggestWord word : breakSuggestion) {
240 if (!firstOne) {
241 sb.append(" ");
242 }
243 firstOne = false;
244 sb.append(word.string);
245 if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
246 freq = Math.max(freq, word.freq);
247 } else {
248 freq += word.freq;
249 }
250 }
251 breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(),
252 freq));
253 }
254 }
255 }
256 breakSuggestionList.addAll(noBreakSuggestionList);
257
258 List<ResultEntry> combineSuggestionList = Collections.emptyList();
259 CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations(
260 termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode);
261 if (combineWords) {
262 combineSuggestionList = new ArrayList<>(
263 combineSuggestions.length);
264 for (CombineSuggestion cs : combineSuggestions) {
265 int firstTermIndex = cs.originalTermIndexes[0];
266 int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
267 sb.delete(0, sb.length());
268 for (int i = firstTermIndex; i <= lastTermIndex; i++) {
269 if (i > firstTermIndex) {
270 sb.append(" ");
271 }
272 sb.append(tokenArr[i].toString());
273 }
274 Token token = new Token(sb.toString(), tokenArr[firstTermIndex]
275 .startOffset(), tokenArr[lastTermIndex].endOffset());
276 combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string,
277 cs.suggestion.freq));
278 }
279 }
280
281
282 SpellingResult result = new SpellingResult();
283 Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
284 Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
285 ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
286 ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
287 int breakCount = 0;
288 int combineCount = 0;
289 while (lastBreak != null || lastCombine != null) {
290 if (lastBreak == null) {
291 addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
292 lastCombine = null;
293 } else if (lastCombine == null) {
294 addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
295 lastBreak = null;
296 } else if (lastBreak.freq < lastCombine.freq) {
297 addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
298 lastCombine = null;
299 } else if (lastCombine.freq < lastBreak.freq) {
300 addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
301 lastBreak = null;
302 } else if (breakCount >= combineCount) {
303 addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
304 lastCombine = null;
305 } else {
306 addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
307 lastBreak = null;
308 }
309 if (lastBreak == null && breakIter.hasNext()) {
310 lastBreak = breakIter.next();
311 breakCount++;
312 }
313 if (lastCombine == null && combineIter.hasNext()) {
314 lastCombine = combineIter.next();
315 combineCount++;
316 }
317 }
318 return result;
319 }
320 private void addToResult(SpellingResult result, Token token, int tokenFrequency, String suggestion, int suggestionFrequency) {
321 if(suggestion==null) {
322 result.add(token, Collections.<String>emptyList());
323 result.addFrequency(token, tokenFrequency);
324 } else {
325 result.add(token, suggestion, suggestionFrequency);
326 result.addFrequency(token, tokenFrequency);
327 }
328 }
329
330 private int getCombineFrequency(IndexReader ir, Token token) throws IOException {
331 String[] words = spacePattern.split(token.toString());
332 int result = 0;
333 if(sortMethod==BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
334 for(String word : words) {
335 result = Math.max(result, ir.docFreq(new Term(field, word)));
336 }
337 } else {
338 for(String word : words) {
339 result += ir.docFreq(new Term(field, word));
340 }
341 }
342 return result;
343 }
344
345 @Override
346 public void build(SolrCore core, SolrIndexSearcher searcher) {
347
348 }
349
350 @Override
351 public void reload(SolrCore core, SolrIndexSearcher searcher)
352 throws IOException {
353
354 }
355
356 @Override
357 public boolean isSuggestionsMayOverlap() {
358 return true;
359 }
360 }